{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 测试我们自己的算法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn import datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "# 鸢尾花数据样本\n",
    "iris = datasets.load_iris()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "X = iris.data\n",
    "y = iris.target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150, 4)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150,)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# train_test_split  训练和测试数据分离,大部分用来训练，一小部分用来验证"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
       "       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
       "       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 直接对X和y进行shuffle会导致X和y不再同步。既然X和y的行数是一样的，那么更妙的解决方案第对索引进行shuffle\n",
    "shuffle_indexes = np.random.permutation(len(X)) # 元素个数来作为随机索引的个数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([104,  75,  29, 124,  85,  12, 133, 115,  56,   4,  38,  60,  51,\n",
       "        66,  21,   9, 100,  55,  44,  47,  92,  96, 117, 109, 142, 129,\n",
       "       145, 108,   7, 120,  25,  50,   1,  88, 134,  15,  28, 125,   2,\n",
       "         6,  42,  69,  45,  64,  58,  11,  48, 140,  68, 126,  22, 127,\n",
       "        36,  78,  81,  71, 106, 118,  70,  43, 131,  72, 148,   8,  46,\n",
       "         5,  62,  79, 111,  16, 128,  84,  26,  53,  83,  20,  17,  89,\n",
       "        13,  35, 144, 138, 139, 122, 103, 107,  97,  10, 121, 123,  67,\n",
       "        94,  18,  95,  24,  39,  34,  61, 149, 112,  33,  37,  41, 135,\n",
       "       116,  77,  93,  63, 102,  82, 110, 146,  91,  57,  32, 119,  99,\n",
       "        31,  23,  98,  27,  54,  86, 114,   3, 136, 132,  49, 147,  14,\n",
       "        19, 101,   0,  80,  76,  65,  90, 141, 105, 113,  40,  52,  73,\n",
       "       143, 137,  59,  30,  74,  87, 130])"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "shuffle_indexes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "test_ratio = 0.2  # 选取多大比例的数据进行模型验证\n",
    "test_size = int(len(X)*test_ratio)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "30"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "test_indexes = shuffle_indexes[:test_size] # 取样本数据的前test_size个作为测试数据集\n",
    "train_indexes = shuffle_indexes[test_size:] # test_size后面的数据作为训练数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([104,  75,  29, 124,  85,  12, 133, 115,  56,   4,  38,  60,  51,\n",
       "        66,  21,   9, 100,  55,  44,  47,  92,  96, 117, 109, 142, 129,\n",
       "       145, 108,   7, 120])"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_indexes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 25,  50,   1,  88, 134,  15,  28, 125,   2,   6,  42,  69,  45,\n",
       "        64,  58,  11,  48, 140,  68, 126,  22, 127,  36,  78,  81,  71,\n",
       "       106, 118,  70,  43, 131,  72, 148,   8,  46,   5,  62,  79, 111,\n",
       "        16, 128,  84,  26,  53,  83,  20,  17,  89,  13,  35, 144, 138,\n",
       "       139, 122, 103, 107,  97,  10, 121, 123,  67,  94,  18,  95,  24,\n",
       "        39,  34,  61, 149, 112,  33,  37,  41, 135, 116,  77,  93,  63,\n",
       "       102,  82, 110, 146,  91,  57,  32, 119,  99,  31,  23,  98,  27,\n",
       "        54,  86, 114,   3, 136, 132,  49, 147,  14,  19, 101,   0,  80,\n",
       "        76,  65,  90, 141, 105, 113,  40,  52,  73, 143, 137,  59,  30,\n",
       "        74,  87, 130])"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_indexes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "# Fancy_indexes 获取训练数据集\n",
    "X_train = X[train_indexes]\n",
    "y_train = y[train_indexes]\n",
    "# Fancy_indexes 获取测试数据集\n",
    "X_test = X[test_indexes]\n",
    "y_test = y[test_indexes]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(120, 4)\n",
      "(120,)\n"
     ]
    }
   ],
   "source": [
    "print(X_train.shape)\n",
    "print(y_train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(30, 4)\n",
      "(30,)\n"
     ]
    }
   ],
   "source": [
    "print(X_test.shape)\n",
    "print(y_test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 使用我们自定义的算法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "from playML.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "X_train, y_train, X_test, y_test = train_test_split(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(120, 4)\n",
      "(120,)\n"
     ]
    }
   ],
   "source": [
    "print(X_train.shape)\n",
    "print(y_train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(30, 4)\n",
      "(30,)\n"
     ]
    }
   ],
   "source": [
    "print(X_test.shape)\n",
    "print(y_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "from playML.kNN import kNNClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "my_knn_clf = kNNClassifier(k=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "kNN(k = 3)"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "my_knn_clf.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "y_predict = my_knn_clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 1, 1, 1, 2, 0, 2, 0, 2, 1, 1, 0, 0, 2, 2, 2, 0, 0, 1, 0, 2, 2, 1,\n",
       "       1, 1, 2, 1, 1, 2, 2])"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_predict # 预测结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 2, 1, 1, 2, 0, 2, 0, 2, 1, 1, 0, 0, 2, 2, 2, 0, 0, 1, 0, 2, 2, 1,\n",
       "       1, 1, 2, 1, 1, 2, 2])"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_test   # 实际结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "29"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sum(y_predict == y_test)  # 看看两个数组中中多少相同地，也即预测正确的"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.96666666666666667"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sum(y_predict == y_test)/len(y_test) # 获得正确率.结果其实是和前面选取的随机数组有关地"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# sklearn中的train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 注意 X_train, X_test, y_train, y_test的序列和上面的不同哈。要想设置随机种子，传入random_state参数即可\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 这里的test_size就相当于上面的test_ratio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(105, 4)\n",
      "(105,)\n"
     ]
    }
   ],
   "source": [
    "print(X_train.shape)\n",
    "print(y_train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(45, 4)\n",
      "(45,)\n"
     ]
    }
   ],
   "source": [
    "print(X_test.shape)\n",
    "print(y_test.shape)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
